Agilent 1 Channel Processing

Published

February 3, 2023

1 Validate Parameters

Code
# DEBUG UNTIL CONTAINER UPDATED
chooseCRANmirror(ind = 1)
install.packages('matrixStats', lib=".")
# ENDDEBUG

library(dplyr) # Ensure infix operator is available, methods should still reference dplyr namespace otherwise
options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.'
# Guard: the runsheet path is mandatory -- fail fast before any downstream work
if (is.null(params$runsheet)) {
  stop("PARAMETERIZATION ERROR: Must supply runsheet path")
}

# Use '<-' consistently for assignment (the rest of the document uses '<-')
runsheet <- params$runsheet # <path/to/runsheet>
annotation_file_path <- as.character(params$annotation_file_path) # Annotation file from 'genelab_annots_link' column of https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv
organism <- params$organism # Used to determine primary keytype
message(params)

2 Load Metadata and Raw Data

Code
print("Loading Runsheet...") # NON_DPPD
[1] "Loading Runsheet..."
Code
# fileEncoding removes strange characters from the column names
df_rs <- read.csv(runsheet, check.names = FALSE, fileEncoding = 'UTF-8-BOM') 
# NON_DPPD:START
print("Here is the embedded runsheet")
[1] "Here is the embedded runsheet"
Code
DT::datatable(df_rs)
Code
print("Here are the expected comparison groups")
[1] "Here are the expected comparison groups"
Code
# NON_DPPD:END
print("Loading Raw Data...") # NON_DPPD
[1] "Loading Raw Data..."
Code
# TODO: priority-low generalize this utility function
# TODO: priority-low generalize this utility function
allTrue <- function(i_vector) {
  #' Return TRUE only if every element of a non-empty logical vector is TRUE
  #'
  #' @param i_vector logical vector; must have length > 0
  #' @return single logical value, as produced by all()
  if ( length(i_vector) == 0 ) {
    # all() on a zero-length vector is vacuously TRUE; treat that as a caller
    # error instead. (The single-argument paste() wrapper was a no-op.)
    stop("Input vector is length zero")
  }
  all(i_vector)
}

# Define paths to raw data files
runsheetPathsAreURIs <- function(df_runsheet) {
  #' Report whether every 'Array Data File Path' entry looks like an https URI
  #'
  #' @param df_runsheet runsheet data.frame with an 'Array Data File Path' column
  #' @return TRUE when all paths start with "https"; errors (via allTrue) on empty input
  paths <- df_runsheet$`Array Data File Path`
  allTrue(startsWith(paths, "https"))
}


# Download raw data files
downloadFilesFromRunsheet <- function(df_runsheet) {
  #' Fetch each raw data file listed in the runsheet, skipping files already on disk
  #'
  #' @param df_runsheet runsheet data.frame with 'Array Data File Path' (source URL)
  #'   and 'Array Data File Name' (local destination) columns
  #' @return character vector of local destination paths
  urls <- df_runsheet$`Array Data File Path`
  destinationFiles <- df_runsheet$`Array Data File Name`

  for ( file_i in seq_along(urls) ) {
    print(paste0("Downloading from '", urls[file_i], "' TO '", destinationFiles[file_i], "'"))
    if ( file.exists(destinationFiles[file_i]) ) {
      # Keep an existing copy rather than re-downloading, but make it visible
      warning(paste( "Using Existing File:", destinationFiles[file_i] ))
    } else {
      download.file(urls[file_i], destinationFiles[file_i])
    }
  }

  destinationFiles # Return these paths
}

# Resolve raw data locations: download when the runsheet lists URIs,
# otherwise use the paths as-is.
if ( !runsheetPathsAreURIs(df_rs) ) {
  print("Or Determined Raw Data Locations are local paths")
  local_paths <- df_rs$`Array Data File Path`
} else {
  print("Determined Raw Data Locations are URIS")
  local_paths <- downloadFilesFromRunsheet(df_rs)
}
[1] "Determined Raw Data Locations are URIS"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775174_A_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775174_A_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775175_B_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775175_B_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775176_C_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775176_C_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775177_D_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775177_D_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775178_E_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775178_E_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775179_F_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775179_F_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775180_G_Pre-1_1.txt.gz' TO 'GLDS-542_microarray_GSM2775180_G_Pre-1_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775181_A_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775181_A_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775182_B_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775182_B_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775183_C_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775183_C_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775184_D_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775184_D_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775185_E_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775185_E_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775186_F_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775186_F_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775187_G_C14_1.txt.gz' TO 'GLDS-542_microarray_GSM2775187_G_C14_1.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775188_A_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775188_A_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775189_B_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775189_B_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775190_C_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775190_C_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775191_D_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775191_D_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775192_E_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775192_E_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775193_F_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775193_F_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775194_G_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775194_G_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775195_H_Pre-1_2.txt.gz' TO 'GLDS-542_microarray_GSM2775195_H_Pre-1_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775196_A_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775196_A_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775197_B_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775197_B_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775198_C_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775198_C_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775199_D_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775199_D_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775200_E_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775200_E_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775201_F_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775201_F_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775202_G_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775202_G_C14_2.txt.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-542/download?source=datamanager&file=GLDS-542_microarray_GSM2775203_H_C14_2.txt.gz' TO 'GLDS-542_microarray_GSM2775203_H_C14_2.txt.gz'"
Code
# Decompress files if needed
# BUGFIX: the '.gz' patterns previously used an unescaped dot, which is a regex
# wildcard (e.g. a file ending in 'agz' would have matched). The dot is now escaped.
if ( allTrue(stringr::str_ends(local_paths, "\\.gz")) ) {
  print("Determined these files are gzip compressed... Decompressing now")
  # This does the decompression (original .gz files are kept on disk)
  lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE)
  # Strip the '.gz' extension to get the decompressed filenames.
  # stringr::str_replace is vectorized over its first argument, so the previous
  # vapply loop was unnecessary.
  local_paths <- stringr::str_replace(local_paths, pattern = "\\.gz$", replacement = "")
}
[1] "Determined these files are gzip compressed... Decompressing now"
Code
df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE)
# NON_DPPD:START
print("Raw Data Loaded Successfully")
[1] "Raw Data Loaded Successfully"
Code
DT::datatable(df_local_paths)
Code
# NON_DPPD:END


# Load raw data into R object
# limma::read.maimages parses each Agilent raw text file into a single EListRaw
raw_data <- limma::read.maimages(df_local_paths$`Local Paths`, 
                                 source = "agilent",  # Specify platform
                                 green.only = TRUE, # Specify one-channel design
                                 names = df_local_paths$`Sample Name` # Map column names as Sample Names (instead of default filenames)
                                 )
Read GLDS-542_microarray_GSM2775174_A_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775175_B_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775176_C_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775177_D_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775178_E_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775179_F_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775180_G_Pre-1_1.txt 
Read GLDS-542_microarray_GSM2775181_A_C14_1.txt 
Read GLDS-542_microarray_GSM2775182_B_C14_1.txt 
Read GLDS-542_microarray_GSM2775183_C_C14_1.txt 
Read GLDS-542_microarray_GSM2775184_D_C14_1.txt 
Read GLDS-542_microarray_GSM2775185_E_C14_1.txt 
Read GLDS-542_microarray_GSM2775186_F_C14_1.txt 
Read GLDS-542_microarray_GSM2775187_G_C14_1.txt 
Read GLDS-542_microarray_GSM2775188_A_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775189_B_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775190_C_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775191_D_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775192_E_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775193_F_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775194_G_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775195_H_Pre-1_2.txt 
Read GLDS-542_microarray_GSM2775196_A_C14_2.txt 
Read GLDS-542_microarray_GSM2775197_B_C14_2.txt 
Read GLDS-542_microarray_GSM2775198_C_C14_2.txt 
Read GLDS-542_microarray_GSM2775199_D_C14_2.txt 
Read GLDS-542_microarray_GSM2775200_E_C14_2.txt 
Read GLDS-542_microarray_GSM2775201_F_C14_2.txt 
Read GLDS-542_microarray_GSM2775202_G_C14_2.txt 
Read GLDS-542_microarray_GSM2775203_H_C14_2.txt 
Code
# Summarize raw data
print("Summarized Raw Data Below") # NON_DPPD
[1] "Summarized Raw Data Below"
Code
print(paste0("Number of Arrays: ", dim(raw_data)[2]))
[1] "Number of Arrays: 30"
Code
print(paste0("Number of Probes: ", dim(raw_data)[1]))
[1] "Number of Probes: 44495"
Code
message(paste0("Number of Arrays: ", dim(raw_data)[2])) # NON_DPPD
message(paste0("Number of Probes: ", dim(raw_data)[1])) # NON_DPPD
# NON_DPPD:START
DT::datatable(raw_data$targets, caption = "Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END

3 QA For Raw Data

3.1 Density Plot

Code
limma::plotDensities(raw_data, 
                     log = TRUE, 
                     legend = "topright")

Density of raw intensities for each array. These are raw intensity values with background intensity values subtracted. A lack of overlap indicates a need for normalization.

3.2 Pseudo Image Plots

Code
agilentImagePlot <- function(eListRaw) {
  #' Draw a pseudo-image plot of log2 intensities for every array
  #'
  #' Adapted from this discussion: https://support.bioconductor.org/p/15523/
  #' @param eListRaw limma EListRaw or EList with $genes (Row and column position
  #'   columns), $E intensity matrix and $targets sample table
  copy_raw_data <- eListRaw
  copy_raw_data$genes$Block <- 1 # Agilent arrays only have one block
  # NOTE(review): assumes the second $genes column holds the array column
  # position -- confirm if reused for other platforms
  names(copy_raw_data$genes)[2] <- "Column"
  copy_raw_data$printer <- limma::getLayout(copy_raw_data$genes)

  r <- copy_raw_data$genes$Row
  c <- copy_raw_data$genes$Column
  nr <- max(r)
  nc <- max(c)
  y <- rep(NA, nr * nc)
  i <- (r - 1) * nc + c # linear index of each probe within the full grid
  # seq_along() is safe for any number of arrays; seq() on a character vector
  # errors when there is exactly one column
  for ( array_i in seq_along(colnames(copy_raw_data$E)) ) {
    y[i] <- log2(copy_raw_data$E[, array_i])
    limma::imageplot(y, copy_raw_data$printer, main = rownames(copy_raw_data$targets)[array_i])
  }
}

agilentImagePlot(raw_data)

3.3 MA Plots

Code
# Per-array MA plots. seq_along() replaces seq() on a character vector, which
# errors when there is exactly one array.
for ( array_i in seq_along(colnames(raw_data$E)) ) {
  message(glue::glue("MA Plot for array: {array_i} of {length(colnames(raw_data$E))}")) # NON_DPPD
  sample_name <- rownames(raw_data$targets)[array_i]
  limma::plotMA(raw_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main = sample_name, status=raw_data$genes$ControlType)
}

3.4 Foreground-Background Plots

Code
# Per-array foreground vs. background plots. seq_along() replaces seq() on a
# character vector, which errors when there is exactly one array.
for ( array_i in seq_along(colnames(raw_data$E)) ) {
  message(glue::glue("FB Plot for array: {array_i} of {length(colnames(raw_data$E))}")) # NON_DPPD
  sample_name <- rownames(raw_data$targets)[array_i]
  limma::plotFB(raw_data, array = array_i, xlab = "log2 Background", ylab = "log2 Foreground", main = sample_name) 
}

3.5 Boxplots

Code
boxplotExpressionSafeMargin <- function(data) {
  #' Draw boxplots of log2 expression values, one box per sample
  #'
  #' Widens the bottom plot margin (capped at 35 lines) so the vertical
  #' sample-name labels fit under the plot.
  #' NOTE: modifies par() margins for the current device without restoring them.
  #' @param data limma::EListRaw or limma::EList
  max_label_chars <- max(nchar(rownames(data$targets)))
  par(mar = c(min(35, max_label_chars), 2, 1, 1))
  boxplot(log2(data$E), las = 2)
}

boxplotExpressionSafeMargin(raw_data)

4 Background Correction

Code
norm_data <- limma::backgroundCorrect(raw_data, method = "normexp")
Array 1 corrected
Array 2 corrected
Array 3 corrected
Array 4 corrected
Array 5 corrected
Array 6 corrected
Array 7 corrected
Array 8 corrected
Array 9 corrected
Array 10 corrected
Array 11 corrected
Array 12 corrected
Array 13 corrected
Array 14 corrected
Array 15 corrected
Array 16 corrected
Array 17 corrected
Array 18 corrected
Array 19 corrected
Array 20 corrected
Array 21 corrected
Array 22 corrected
Array 23 corrected
Array 24 corrected
Array 25 corrected
Array 26 corrected
Array 27 corrected
Array 28 corrected
Array 29 corrected
Array 30 corrected

5 Between Array Normalization

Code
# Normalize background-corrected data using the quantile method
norm_data <- limma::normalizeBetweenArrays(norm_data, method = "quantile")
print("Summarized Normalized Data Below") # NON_DPPD
[1] "Summarized Normalized Data Below"
Code
print("Note: These are expected to be the same values as the normalized data since no filtering/summarization has been performed") # NON_DPPD
[1] "Note: These are expected to be the same values as the normalized data since no filtering/summarization has been performed"
Code
# Summarize background-corrected and normalized data
print(paste0("Number of Arrays: ", dim(norm_data)[2]))
[1] "Number of Arrays: 30"
Code
print(paste0("Number of Probes: ", dim(norm_data)[1]))
[1] "Number of Probes: 44495"
Code
message(paste0("Number of Arrays: ", dim(norm_data)[2])) # NON_DPPD
message(paste0("Number of Probes: ", dim(norm_data)[1])) # NON_DPPD
# NON_DPPD:START
DT::datatable(norm_data$targets, caption = "Sample to File Mapping")
Code
DT::datatable(head(norm_data$genes, n = 20), caption = "First 20 rows of normalized data file embedded probes to genes table")
Code
# NON_DPPD:END

6 Normalized Data Quality Assessment

6.1 Density Plot

Code
limma::plotDensities(norm_data, 
                     log = TRUE, 
                     legend = "topright")

Density of normalized intensities for each array. Near complete overlap is expected after normalization.

6.2 Pseudo Image Plots

Code
agilentImagePlot(norm_data)

6.3 MA Plots

Code
# Per-array MA plots on normalized data. seq_along() replaces seq() on a
# character vector, which errors when there is exactly one array.
for ( array_i in seq_along(colnames(norm_data$E)) ) {
  sample_name <- rownames(norm_data$targets)[array_i]
  limma::plotMA(norm_data,array=array_i,xlab="Average log-expression",ylab="Expression log-ratio (this sample vs. others)", main = sample_name, status=norm_data$genes$ControlType)
}

6.4 Boxplots

Code
boxplotExpressionSafeMargin(norm_data)

7 Perform Probeset Differential Expression

7.1 Approach Motivation

Based on bioconductor discussions with Gordon Smyth (“[Smyth’s] research group created the limma, edgeR, goseq, Rsubread, csaw and diffHic packages.”) here: https://support.bioconductor.org/p/116616/#116674

Summarization using avereps is ‘deliberately designed for cases when each probe is associated with exactly one gene’.

Given good quality gene annotations (which this analysis should have by using bioMart Ensembl Gene annotations), ‘most Agilent probes should map to a unique gene’.

Based on this discussion, the following approach is utilized:

  1. Add biomart annotations first
  2. Compute all mapping statistics (i.e. multimapping (one-probe to many-genes), redundant mapping (many-probes to one-gene))
  3. Perform DE with and without filtering (assessing effects of filtering on row-wise results)

Don’t summarize the results from the probe level since any probe with a single gene mapping being DE implies the gene is a DEG. https://support.bioconductor.org/p/93796/#116605

7.2 Probe Differential Expression (DE)

7.2.1 Add Probe Annotations

Code
shortenedOrganismName <- function(long_name) {
  #' Convert organism names like 'Homo Sapiens' into 'hsapiens'
  #'
  #' @param long_name character scalar: genus and species separated by a space
  #' @return lower-cased first letter of the genus pasted onto the species name
  tokens <- strsplit(long_name, " ")[[1]]
  genus_initial <- substr(tokens[1], 1, 1)
  tolower(paste0(genus_initial, tokens[2]))
}


# locate dataset
expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
[1] "Expected dataset name: 'hsapiens_gene_ensembl'"
Code
message(paste0("Expected dataset name: '", expected_dataset_name, "'")) # NON_DPPD


# Specify Ensembl version used in current GeneLab reference annotations
ENSEMBL_VERSION <- '107'
print(paste0("Searching for Ensembl Version: ", ENSEMBL_VERSION)) # NON_DPPD
[1] "Searching for Ensembl Version: 107"
Code
ensembl <- biomaRt::useEnsembl(biomart = "genes", 
                               dataset = expected_dataset_name,
                               version = ENSEMBL_VERSION)
Warning in listEnsemblArchives(https = FALSE): Ensembl will soon enforce the use of https.
As such the 'https' argument will be deprecated in the next release.
Code
print(ensembl)
Object of class 'Mart':
  Using the ENSEMBL_MART_ENSEMBL BioMart database
  Using the hsapiens_gene_ensembl dataset
Code
getBioMartAttribute <- function(df_rs, params) {
  #' Returns resolved biomart attribute
  #'
  #' Resolution order:
  #'   1. 'biomart_attribute' column of the runsheet (formatted for biomart)
  #'   2. 'biomart_attribute' entry in the render parameters
  #' An error is thrown if neither source provides a value.
  #'
  #' @param df_rs runsheet data.frame (may contain a 'biomart_attribute' column)
  #' @param params render parameters list (may contain $biomart_attribute)
  #' @return character attribute name usable in biomaRt queries

  # check if runsheet has a biomart attribute column
  if ( !is.null(df_rs$`biomart_attribute`) ) {
    print("Using attribute name sourced from runsheet")
    # Format according to biomart needs: spaces -> underscores, lower case only
    formatted_value <- tolower(gsub(" ", "_", unique(df_rs$`biomart_attribute`), fixed = TRUE))
    return(formatted_value)
  } else {
    # BUGFIX: this message used to reference 'Array Design REF', but the column
    # actually checked above is 'biomart_attribute'
    print("Could not find 'biomart_attribute' in runsheet, falling back to parameters")
  }

  # check if a fallback has been given via params
  if ( !is.null(params$biomart_attribute) ) {
    print("Using attribute name sourced from parameters")
    return(params$biomart_attribute)
  }

  # finally throw an error if neither guard condition was true
  stop("No valid biomart attribute identified")
}

expected_attribute_name <- getBioMartAttribute(df_rs, params)
[1] "Using attribute name sourced from runsheet"
Code
print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
[1] "Expected attribute name: 'agilent_wholegenome_4x44k_v2'"
Code
message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD

# Unique manufacturer probe IDs to be mapped to Ensembl gene IDs via biomart
probe_ids <- unique(norm_data$genes$ProbeName)

# DEBUG:START
# NOTE(review): is.integer() is TRUE only for a true integer parameter;
# a plain numeric like 300 (vs 300L) would silently skip this limit -- confirm intended
if ( is.integer(params$DEBUG_limit_biomart_query) ) {
  warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
  message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
  probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
}
Warning: DEBUG MODE: Limiting query to 300 entries
Code
# DEBUG:END

# Create probe map
# Run Biomart Queries in chunks to prevent request timeouts
#   Note: If timeout is occurring (possibly due to larger load on biomart), reduce chunk size
CHUNK_SIZE <- 8000
probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
# Collect each chunk's result in a preallocated list and bind once at the end
# (binding inside the loop grows the data.frame quadratically)
chunk_results_list <- vector("list", length(probe_id_chunks))
for (i in seq_along(probe_id_chunks)) {
  probe_id_chunk <- probe_id_chunks[[i]]
  print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
  message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) # NON_DPPD
  chunk_results_list[[i]] <- biomaRt::getBM(
      attributes = c(
          expected_attribute_name,
          "ensembl_gene_id"
          ), 
          filters = expected_attribute_name, 
          values = probe_id_chunk, 
          mart = ensembl)

  if (i < length(probe_id_chunks)) {
    # Slight break between requests to prevent back-to-back requests
    # (skipped after the final chunk -- the old code always slept 10s at the end)
    Sys.sleep(10)
  }
}
df_mapping <- dplyr::bind_rows(chunk_results_list)
Running biomart query chunk 1 of 1. Total probes IDS in query (300)
Code
# Convert list of multi-mapped genes to string
listToUniquePipedString <- function(str_list) {
  #! Collapse a vector into a string of its unique elements separated by '|'
  #! e.g. c("GO1","GO2","GO2","GO3") -> "GO1|GO2|GO3"
  comma_joined <- toString(unique(str_list))
  gsub(", ", "|", comma_joined, fixed = TRUE)
}

# Summarize the biomart mapping to one row per probe:
#   ENSEMBL                -> '|'-separated unique Ensembl gene IDs for the probe
#   count_ENSEMBL_mappings -> number of distinct Ensembl genes the probe maps to
unique_probe_ids <- df_mapping %>% 
                      # note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
                      dplyr::group_by(!!sym(expected_attribute_name)) %>% 
                      dplyr::summarise(
                        ENSEMBL = listToUniquePipedString(ensembl_gene_id)
                        ) %>%
                      # Count number of ensembl IDS mapped (pipe count + 1)
                      dplyr::mutate( 
                        count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|"))
                      )

# Attach the mapping to the probe annotation; probes biomart did not return
# get NA for ENSEMBL and an explicit mapping count of 0
norm_data$genes <- norm_data$genes %>% 
  dplyr::left_join(unique_probe_ids, by = c("ProbeName" = expected_attribute_name ) ) %>%
  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) )

7.3 Summarize Biomart Mapping vs. Manufacturer Mapping

Code
# Pie Chart with Percentages
# FIX: use the 'genes' element consistently -- '$gene' previously relied on
# partial name matching to reach '$genes'
slices <- c(
    'Control probes' = nrow(norm_data$genes %>% dplyr::filter(ControlType != 0) %>% dplyr::distinct(ProbeName)), 
    'Unique Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings == 1) %>% dplyr::distinct(ProbeName)), 
    'Multi Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings > 1) %>% dplyr::distinct(ProbeName)), 
    'No Mapping' = nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(count_ENSEMBL_mappings == 0) %>% dplyr::distinct(ProbeName))
)
pct <- round(slices/sum(slices)*100)
# (The previous 'chart_names <- names(slices)' assignment was dead code -- it
# was immediately overwritten by the glue call below.)
chart_names <- glue::glue("{names(slices)} ({slices})") # add counts to labels
chart_names <- paste(chart_names, pct) # add percents to labels
chart_names <- paste(chart_names,"%",sep="") # add % to labels
pie(slices,labels = chart_names, col=rainbow(length(slices)),
    main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(norm_data$genes %>% dplyr::distinct(ProbeName))} Total Unique Probes")
    )

Code
# Count non-control probes the manufacturer annotation maps to a gene
# (ProbeName != SystematicName signals a manufacturer-mapped probe).
# FIX: '<-' for assignment and '$genes' (not partial-matching '$gene').
original_mapping_rate <- nrow(norm_data$genes %>% dplyr::filter(ControlType == 0) %>% dplyr::filter(ProbeName != SystematicName) %>% dplyr::distinct(ProbeName))
print(glue::glue("Original Manufacturer Reported Mapping Rate: {original_mapping_rate}"))
Original Manufacturer Reported Mapping Rate: 32365
Code
print(glue::glue("Biomart Unique Mapping Rate: {slices[['Unique Mapping']]}")) # BUGFIX: previously re-printed original_mapping_rate under the Biomart label (cf. the message() call below which uses the slices value)
Biomart Unique Mapping Rate: 32365
Code
message(glue::glue("Original Manufacturer Reported Mapping Rate: {original_mapping_rate}")) # NON_DPPD
message(glue::glue("Biomart Unique Mapping Rate: {slices[['Unique Mapping']]}")) # NON_DPPD

7.4 Generate Design Matrix

Code
runsheetToDesignMatrix <- function(runsheet_path) {
    #' Build a limma design matrix (and supporting tables) from a runsheet CSV
    #'
    #' @param runsheet_path path to the runsheet csv file
    #' @return list with:
    #'   matrix    - model.matrix(~ 0 + condition) design matrix
    #'   mapping   - data.frame mapping R-safe group names to human readable names
    #'   groups    - data.frame of sample name to human readable group name
    #'   contrasts - matrix of all pairwise group contrasts (both directions)
    df = read.csv(runsheet_path)
    # get only Factor Value columns
    factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
    colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_")
    
    # Load metadata from runsheet csv file
    compare_csv = data.frame(sample_id = df[,c("Sample.Name")], factors)

    # Create data frame containing all samples and respective factors
    study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
    colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
    rownames(study) <- compare_csv[,1] 
    
    # Format groups and indicate the group that each sample belongs to
    if (dim(study)[2] >= 2){
        group<-apply(study,1,paste,collapse = " & ") # concatenate multiple factors into one condition per sample
    } else{
        group<-study[,1]
    }
    group_names <- paste0("(",group,")",sep = "") # human readable group names
    group <- sub("^BLOCKER_", "",  make.names(paste0("BLOCKER_", group))) # group naming compatible with R models; this maintains the default behaviour of make.names except that 'X' is never prepended to group names
    names(group) <- group_names

    # Format contrasts table, defining pairwise comparisons for all groups
    contrast.names <- combn(levels(factor(names(group))),2) # generate matrix of pairwise group combinations for comparison
    # str_sub(col, 2, -2) strips the surrounding parentheses before re-applying the BLOCKER_/make.names conversion
    contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "",  make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2)))))
    contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep = "v")) # format combinations for output table files names
    contrasts <- cbind(contrasts,contrasts[c(2,1),]) # append the reversed-direction contrasts
    colnames(contrasts) <- contrast.names
    sampleTable <- data.frame(condition=factor(group))
    rownames(sampleTable) <- df[,c("Sample.Name")]

    condition <- sampleTable[,'condition']
    names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names))

    design <- model.matrix(~ 0 + condition)
    design_data <- list( matrix = design, mapping = names_mapping, groups = as.data.frame( cbind(sample = df[,c("Sample.Name")], group = group_names) ), contrasts = contrasts )
    return(design_data)
}


# Loading metadata from runsheet csv file
design_data <- runsheetToDesignMatrix(runsheet)
design <- design_data$matrix # model matrix consumed by limma::lmFit below

# Write SampleTable.csv and contrasts.csv file
write.csv(design_data$groups, "SampleTable.csv")
write.csv(design_data$contrasts, "contrasts.csv")

7.5 Perform Individual Probe Level DE

Code
lmFitPairwise <- function(norm_data, design) {
    #' Fit the linear model and test every pairwise group contrast
    #'
    #' Approach based on limma manual section 17.4 (version 3.52.4)
    #' @param norm_data limma EList of normalized expression values
    #' @param design model matrix from model.matrix(~ 0 + condition)
    #' @return eBayes-moderated contrasts fit (limma MArrayLM object)
    fit <- limma::lmFit(norm_data, design)

    # Build every pairwise contrast (both directions) between condition columns
    group_columns <- colnames(fit$design)[which(fit$assign == 1)]
    pair_matrix <- combn(group_columns, 2)
    forward <- paste(pair_matrix[1,], pair_matrix[2,], sep = "-")
    reverse <- paste(pair_matrix[2,], pair_matrix[1,], sep = "-")
    cont.matrix <- limma::makeContrasts(contrasts = c(forward, reverse), levels = design)
    contrast.fit <- limma::contrasts.fit(fit, cont.matrix)

    # Moderated statistics with intensity trend and robust empirical Bayes
    contrast.fit <- limma::eBayes(contrast.fit, trend = TRUE, robust = TRUE)
    return(contrast.fit)
}

# Calculate results
res <- lmFitPairwise(norm_data, design)
DT::datatable(limma::topTable(res)) # NON_DPPD
Code
# Print DE table, without filtering
limma::write.fit(res, adjust = 'BH', 
                file = "INTERIM.csv",
                row.names = FALSE,
                quote = TRUE,
                sep = ",")

7.6 Add Additional Columns and Format DE Table

Code
## Reformat Table for consistency across DE analyses tables within GeneLab ##

# Read in DE table 
df_interim <- read.csv("INTERIM.csv")

# Reformat column names
reformat_names <- function(colname, group_name_mapping) {
  #! Translate limma::write.fit column names into GeneLab DE table conventions
  #!
  #! Converts names like
  #!    "P.value.adj.condition<safeA>.condition<safeB>"
  #! into
  #!    "Adj.p.value_(<originalA>)v(<originalB>)"
  #! 'colname' may be a vector (dplyr::rename_with passes all names at once);
  #! each replacement applies to the first occurrence per element.

  # Strip the 'condition' prefix (introduced while creating the design matrix)
  # and rename statistic columns for consistency across genomics DE outputs
  new_colname <- sub("^P.value.adj.condition", "Adj.p.value_", colname)
  new_colname <- sub("^P.value.condition", "P.value_", new_colname)
  new_colname <- sub("^Coef.condition", "Log2fc_", new_colname) # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html
  new_colname <- sub("^t.condition", "T.stat_", new_colname)
  new_colname <- sub("Genes.ProbeName", "PROBEID", new_colname, fixed = TRUE)
  new_colname <- sub("Genes.SYMBOL", "SYMBOL", new_colname, fixed = TRUE)
  new_colname <- sub("Genes.ENSEMBL", "ENSEMBL", new_colname, fixed = TRUE)
  new_colname <- sub("Genes.GOSLIM_IDS", "GOSLIM_IDS", new_colname, fixed = TRUE)
  new_colname <- sub(".condition", "v", new_colname)

  # remap to the human readable group names that existed before make.names was applied
  for ( row_i in seq(nrow(group_name_mapping)) ) {
    safe_name <- group_name_mapping[row_i, ]$safe_name
    original_name <- group_name_mapping[row_i, ]$original_name
    new_colname <- sub(safe_name, original_name, new_colname, fixed = TRUE)
  }

  return(new_colname)
}

# Apply the renaming to every column; design_data$mapping supplies the
# safe-name -> original-group-name pairs consumed by reformat_names().
df_interim <- df_interim %>% dplyr::rename_with( reformat_names, group_name_mapping = design_data$mapping )


# Concatenate expression values for each sample
# Column-binds the normalized expression matrix (one column per sample).
# NOTE(review): bind_cols() matches by position — assumes row order of
# norm_data$E matches df_interim; confirm upstream ordering is preserved.
df_interim <- df_interim %>% dplyr::bind_cols(norm_data$E)


## Add Group Wise Statistics ##

# Group mean and standard deviations for normalized expression values are
# computed and added to the table, one 'Group.Mean_<group>' and
# 'Group.Stdev_<group>' column pair per group.

unique_groups <- unique(design_data$group$group)
for ( i in seq_along(unique_groups) ) {
  current_group <- unique_groups[i]
  # Collect the unique, sorted sample names belonging to this group.
  # FIX: replaces the original multi-row summarise() idiom (returning a
  # vector per group), which is deprecated in dplyr >= 1.1; filter + pull
  # produces the identical sorted sample vector.
  current_samples <- design_data$group %>%
                      dplyr::filter(group == current_group) %>%
                      dplyr::pull(sample) %>%
                      unique() %>%
                      sort()

  print(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
  print(glue::glue("Group: {current_group}"))
  print(glue::glue("Samples in Group: '{toString(current_samples)}'"))
  # NON_DPPD:START
  message(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
  message(glue::glue("Group: {current_group}"))
  message(glue::glue("Samples in Group: '{toString(current_samples)}'"))
  # NON_DPPD:END

  # Dynamic output column names via glue-style {} interpolation on the
  # mutate() left-hand side (rlang ':=' operator).
  df_interim <- df_interim %>%
    dplyr::mutate(
      "Group.Mean_{current_group}" := rowMeans(select(., all_of(current_samples))),
      "Group.Stdev_{current_group}" := matrixStats::rowSds(as.matrix(select(., all_of(current_samples))))
    ) %>%
    dplyr::ungroup() %>%
    as.data.frame()
}
Computing mean and standard deviation for Group 1 of 2
Group: (pre-confinement)
Samples in Group: 'GSM2775174, GSM2775175, GSM2775176, GSM2775177, GSM2775178, GSM2775179, GSM2775180, GSM2775188, GSM2775189, GSM2775190, GSM2775191, GSM2775192, GSM2775193, GSM2775194, GSM2775195'
Computing mean and standard deviation for Group 2 of 2
Group: (post-confinement for 14 days)
Samples in Group: 'GSM2775181, GSM2775182, GSM2775183, GSM2775184, GSM2775185, GSM2775186, GSM2775187, GSM2775196, GSM2775197, GSM2775198, GSM2775199, GSM2775200, GSM2775201, GSM2775202, GSM2775203'
Code
## Compute all sample mean and standard deviation
# glue() has no interpolation here; print(glue(...)) renders the text without
# the '[1] "..."' decoration that print() on a plain string would add.
print(glue::glue("Computing mean and standard deviation for all samples"))
Computing mean and standard deviation for all samples
Code
# NON_DPPD:START
message(glue::glue("Computing mean and standard deviation for all samples"))
# NON_DPPD:END
all_samples <- design_data$group %>% dplyr::pull(sample)

# Append across-all-samples summary statistics: per-probe mean and standard
# deviation over every sample column, as 'All.mean' / 'All.stdev'.
df_interim <- df_interim %>%
  dplyr::mutate(
    All.mean = rowMeans(select(., all_of(all_samples))),
    All.stdev = matrixStats::rowSds(as.matrix(select(., all_of(all_samples))))
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame()

print("Remove extra columns from final table")
[1] "Remove extra columns from final table"
Code
# These columns are data mapped to column PROBEID as per the original Manufacturer and can be linked as needed
# Dropped from the final DE table; dplyr::any_of() below silently tolerates
# names that are absent from the table.
colnames_to_remove = c(
  "Genes.Row",
  "Genes.Col",
  "Genes.Start",
  "Genes.Sequence",
  "Genes.ControlType",
  "Genes.ProbeName", # NOTE(review): reformat_names() renames this column to 'PROBEID', so this original name may no longer exist — confirm intent
  "Genes.GeneName",
  "Genes.SystematicName",
  "Genes.Description",
  "AveExpr" # Replaced by 'All.mean' column
  # "Genes.count_ENSEMBL_mappings", Keep this
)

df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove))

## Concatenate annotations for genes (for uniquely mapped probes) ##
### Read in annotation table for the appropriate organism ###
# The reference annotation is tab-delimited; quoting and comment parsing are
# disabled so gene descriptions containing quotes or '#' are read intact.
annot <- read.table(
  annotation_file_path,
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = ""
)

# Join annotation table and uniquely mapped data

# Determine appropriate keytype
# Maps the runsheet organism name (params$organism) to the primary annotation
# key column used to join against the reference annotation table.
map_primary_keytypes <- c(
  'Caenorhabditis elegans' = 'ENSEMBL',
  'Danio rerio' = 'ENSEMBL',
  'Drosophila melanogaster' = 'ENSEMBL',
  'Rattus norvegicus' = 'ENSEMBL',
  'Saccharomyces cerevisiae' = 'ENSEMBL',
  'Homo sapiens' = 'ENSEMBL',
  # FIX: was 'Mus sapiens' — not a real species name, so mouse datasets could
  # never match this lookup.
  'Mus musculus' = 'ENSEMBL',
  # FIX: removed stray trailing space ('Arabidopsis thaliana ') that broke
  # exact-name lookup for Arabidopsis.
  'Arabidopsis thaliana' = 'TAIR'
)

# Left-style merge keyed on the organism's primary annotation column
# (e.g. ENSEMBL or TAIR). NOTE(review): base merge() moves the key column
# first and may reorder rows — confirm downstream consumers do not rely on
# the pre-merge row order.
df_interim <- merge(
                annot,
                df_interim,
                by = map_primary_keytypes[[organism]],
                # ensure all original dge rows are kept.
                # If unmatched in the annotation database, then fill missing with NA
                all.y = TRUE
            )

# Save to file
write.csv(df_interim, "differential_expression.csv", row.names = FALSE)

### Add columns needed to generate GeneLab visualization plots
## Add column to indicate the sign (positive/negative) of log2fc for each pairwise comparison
# Work on a copy so the 'differential_expression.csv' written above is unaffected.
df <- df_interim
# sign() yields -1/0/1 per contrast; columns renamed 'Updown_<contrast>'.
updown_table <- sign(df[,grep("Log2fc_",colnames(df))])
colnames(updown_table) <- gsub("Log2fc","Updown",grep("Log2fc_",colnames(df),value = TRUE))
df <- cbind(df,updown_table)
rm(updown_table)
## Add column to indicate contrast significance with p <= 0.1
# Pattern "P.value_" (capital P) does not match the "Adj.p.value_" columns
# (lowercase p), so only raw p-value contrasts are flagged here.
sig.1_table <- df[,grep("P.value_",colnames(df))]<=.1
colnames(sig.1_table) <- gsub("P.value","Sig.1",grep("P.value_",colnames(df),value = TRUE))
df <- cbind(df,sig.1_table)
rm(sig.1_table)
## Add column to indicate contrast significance with p <= 0.05
sig.05_table <- df[,grep("P.value_",colnames(df))]<=.05
colnames(sig.05_table) <- gsub("P.value","Sig.05",grep("P.value_",colnames(df),value = TRUE))
df <- cbind(df, sig.05_table)
rm(sig.05_table)
## Add columns for the volcano plot with p-value and adjusted p-value
# NOTE(review): log2 (not the more conventional -log10) of p-values; the
# output columns are explicitly named 'Log2_*' so this looks intentional —
# confirm the downstream volcano plot expects log2.
log_pval_table <- log2(df[,grep("P.value_", colnames(df))])
colnames(log_pval_table) <- paste0("Log2_", colnames(log_pval_table))
df <- cbind(df, log_pval_table)
rm(log_pval_table)
log_adj_pval_table <- log2(df[,grep("Adj.p.value_", colnames(df))])
colnames(log_adj_pval_table) <- paste0("Log2_", colnames(log_adj_pval_table))
df <- cbind(df, log_adj_pval_table)
rm(log_adj_pval_table)

write.csv(df,
          row.names = FALSE,
          "visualization_output_table.csv"
          )

### Generate and export PCA table for GeneLab visualization plots
## Only use positive expression values, negative values can make up a small portion ( < 0.5% ) of normalized expression values and cannot be log transformed
# NOTE(review): log2 of non-positive values yields NaN (the 'NaNs produced'
# warning); the NaN-containing rows must then be dropped from exp_raw itself
# before PCA.
exp_raw <- log2(norm_data$E) # negatives get converted to NA
Warning: NaNs produced
Code
# FIX: this line previously read `exp_raw <- na.omit(norm_data$E)`, which
# re-assigned the *raw* normalized matrix — silently discarding the log2
# transform computed above, and removing nothing (the NaNs exist only in the
# log2 result, not in norm_data$E). Drop NaN-containing rows from the
# log2-transformed matrix instead, so PCA runs on log2 expression as intended.
exp_raw <- na.omit(exp_raw)
# PCA on samples (rows after transpose); variables are not rescaled.
PCA_raw <- prcomp(t(exp_raw), scale = FALSE)
write.csv(PCA_raw$x,
          "visualization_PCA_table.csv"
          )

8 Version Reporting

Code
## print session info ##
# Session details are rendered into the report here and also logged to
# versions.txt by the final chunk.
print("Session Info below: ")
[1] "Session Info below: "
Code
R version 4.1.3 (2022-03-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.21.so

locale:
[1] C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.0.10

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.9             prettyunits_1.1.1      png_0.1-8             
 [4] Biostrings_2.62.0      assertthat_0.2.1       digest_0.6.31         
 [7] utf8_1.2.2             BiocFileCache_2.2.0    R6_2.5.1              
[10] GenomeInfoDb_1.30.1    stats4_4.1.3           RSQLite_2.2.20        
[13] evaluate_0.19          httr_1.4.4             pillar_1.8.1          
[16] zlibbioc_1.40.0        rlang_1.0.6            progress_1.2.2        
[19] curl_4.3.3             jquerylib_0.1.4        blob_1.2.3            
[22] S4Vectors_0.32.4       R.utils_2.12.2         R.oo_1.25.0           
[25] DT_0.26                rmarkdown_2.20         splines_4.1.3         
[28] statmod_1.5.0          stringr_1.5.0          htmlwidgets_1.6.1     
[31] RCurl_1.98-1.9         bit_4.0.5              biomaRt_2.50.0        
[34] compiler_4.1.3         xfun_0.36              pkgconfig_2.0.3       
[37] BiocGenerics_0.40.0    htmltools_0.5.4        tidyselect_1.2.0      
[40] KEGGREST_1.34.0        tibble_3.1.8           GenomeInfoDbData_1.2.7
[43] matrixStats_0.63.0     IRanges_2.28.0         XML_3.99-0.13         
[46] fansi_1.0.3            withr_2.5.0            dbplyr_2.2.1          
[49] crayon_1.5.2           rappdirs_0.3.3         bitops_1.0-7          
[52] R.methodsS3_1.8.2      jsonlite_1.8.4         lifecycle_1.0.3       
[55] DBI_1.1.3              magrittr_2.0.3         cli_3.6.0             
[58] stringi_1.7.12         cachem_1.0.6           XVector_0.34.0        
[61] limma_3.50.3           xml2_1.3.3             bslib_0.4.2           
[64] filelock_1.0.2         ellipsis_0.3.2         generics_0.1.3        
[67] vctrs_0.5.1            tools_4.1.3            bit64_4.0.5           
[70] Biobase_2.54.0         glue_1.6.2             purrr_1.0.1           
[73] hms_1.1.2              crosstalk_1.2.0        fastmap_1.1.0         
[76] yaml_2.3.6             AnnotationDbi_1.56.2   memoise_2.0.1         
[79] knitr_1.41             sass_0.4.4            
Code
## Log same info into versions.txt file
version_output_fn <- "versions.txt"
# NOTE(review): `append = TRUE` presumably accumulates entries from earlier
# pipeline steps into a shared versions.txt — confirm; a stale file left by a
# previous run would also be appended to rather than replaced.
# NOTE(review): tools:::.BioC_version_associated_with_R_version is an
# unexported internal function and may change between R releases.
cat(capture.output(sessionInfo()),
    "BioC_version_associated_with_R_version",
    toString(tools:::.BioC_version_associated_with_R_version()),
    file = version_output_fn,
    append = TRUE,
    sep = "\n")
Code
#save.image(file = paste0(params$id, ".RData")) # DEBUG